#!/usr/bin/R

# Analyse posts made by AfD themselves between 2013 and February 2018


# AfD ID: 540404695989874


library(dplyr)
library(ggplot2)
library(lubridate)
library(ggthemes)
library(stringr)
library(tidyr)
library(tibble)
library(reshape2)
library(MASS)
library(mfp)

library(tm)
library(qdap)
library(quanteda)
library(SnowballC)


# Observation window as ISO date strings (YYYY-MM-DD).
# NOTE(review): enddate looks like the day of the 2017 federal election, which
# the plotting code further down refers to -- confirm.
startdate <- "2013-03-11"
enddate <- "2017-09-24"



wholepage <- readRDS("afdpage-complete")

# Facebook id of the AfD page itself; rows whose from_id equals it are posts
# written by the party, everything else was posted by other users.
afd_id <- 540404695989874

# Filter once and reuse the result. filter() drops rows where from_id is NA,
# whereas logical `[` indexing would keep them as all-NA rows and inflate the
# counts.
afdposts <- wholepage %>% filter(from_id == afd_id)

# re-format dates: keep only the YYYY-MM-DD part of the timestamp
afdposts$date <- str_extract(afdposts$created_time, "[0-9]{4}-[0-9]{2}-[0-9]{2}")

# How often have the AfD posted?
n_afd_posts <- nrow(afdposts)
print(paste("The AfD have posted", n_afd_posts, "times"))

# Posts per day over the span between the earliest and the latest AfD post.
# Using min/max instead of first/last row keeps the rate positive regardless
# of the order in which the rows are stored.
post_days <- as.Date(afdposts$date)
active_days <- as.numeric(difftime(max(post_days, na.rm = TRUE),
                                   min(post_days, na.rm = TRUE),
                                   units = "days"))
print(paste("That is a cool", n_afd_posts / active_days, "per day"))

# How often have others posted?
print(paste("Others have posted", nrow(wholepage) - n_afd_posts, "times"))

# Print the like / comment / share totals of a set of posts.
report_engagement <- function(posts) {
  print(paste(sum(posts$likes_count), "Likes"))
  print(paste(sum(posts$comments_count), "Comments"))
  print(paste(sum(posts$shares_count), "Shares"))
  invisible(posts)
}

# Comments and likes
print("All these posts have attracted:")
report_engagement(wholepage)

# Look only at the AfD posts
print("The AfD's posts have attracted:")
report_engagement(afdposts)


# Posts per day
#
# `afdposts` (the rows of `wholepage` written by the AfD page) and its `date`
# column (YYYY-MM-DD string taken from created_time) have already been built
# right after `wholepage` was read and are not modified in between, so they
# are reused here instead of being recomputed.

# summarise: number of AfD posts on each day that has at least one post
count_afd_posts <- afdposts %>%
  group_by(date) %>%
  summarise(count = n())

# explicit print() so the summary also shows up when the script is source()d
print(summary(count_afd_posts$count))

# Plot posts per day
ggplot(count_afd_posts, aes(x = as.Date(date), y = count)) +
  geom_line() +
  theme_tufte() +
  scale_x_date() +
  xlab("") +
  ylab("AfD-Posts / Tag") +
  coord_fixed(ratio = 50)

# ggsave() without a plot argument writes the most recently created plot
ggsave("posts-per-day.png", width = 6, height = 4)

# Look at vocabulary. This works better than quanteda clean function
# Remove URLs, all quotes (hopefully), punctuation, and extra whitespace
#
# Regex for http:// and https:// URLs. It is used further down both to extract
# the URLs from a post and to blank them out of the message text.
# After the scheme it accepts one or more of: ASCII letters, digits, the class
# [$-_@.&+], the characters ! * ( ) , and %XX hex escapes.
# Note that `$-_` inside the character class is a *range* (ASCII `$` through
# `_`), so it also covers / : ; = ? and other punctuation in that range; in
# effect a match runs on until whitespace or a character outside these sets
# (for example `"` or `#`).
url_pattern <- "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"


# Extract URLs

# Thin wrapper around str_extract_all() for pulling URLs out of text.
#
# string: character vector to search (matched against the global url_pattern).
# Returns the distinct matches pooled over all elements of `string`, or a
# single NA when nothing matched at all.
extracturls <- function(string) {
  hits <- unique(unlist(str_extract_all(string = string, pattern = url_pattern)))
  if (length(hits) == 0L) {
    return(NA)
  }
  hits
}
# Extract URLs from post. URLs may appear in message field, link field, or
# both, so the two fields are pasted together before matching; the unique()
# inside extracturls() collapses a URL that occurs in both.
afdposts$urls <- lapply(paste(afdposts$message, afdposts$link), extracturls)

# Domain pattern: captures the label immediately left of the last host label
# (e.g. "example" in //www.example.com/).
#  * The leading sub-domain part is optional, so two-label hosts such as
#    //twitter.com/ or //youtu.be/ are counted (previously a sub-domain was
#    required and these URLs were silently dropped).
#  * Hyphens are allowed in every label, including sub-domains.
#  * The host may be followed by a path, port, query, fragment or the end of
#    the URL, not only by "/".
# Limitation (unchanged): for multi-part suffixes such as .co.uk the captured
# label is the "co" part, not the registered name.
pattern <- "//(?:[a-zA-Z0-9.-]+\\.)?([a-zA-Z0-9-]+)\\.[a-zA-Z0-9]+(?:[/:?#]|$)"

# Extract domains of URLs posted by AfD; column 2 of str_match() is the first
# capture group. Posts without URL contribute NA and are dropped.
afddomains <- str_match(unlist(afdposts$urls), pattern)[, 2]
afddomains <- afddomains[!is.na(afddomains)]

domain_counts <- table(as.factor(afddomains))
# explicit print() so the results also show up when the script is source()d
print(paste("The AfD has posted", length(afddomains), "links to", length(domain_counts), "unique domains"))
print(sort(domain_counts))



# Build cleaned versions of the message text:
#   cleantext               URLs, quotes, "+", punctuation removed; lower case;
#                           runs of whitespace collapsed to one blank
#   cleantext.stopped       additionally without the party's own name
#   cleantext.stopped.graph additionally with every non-[:graph:] character
#                           replaced by a blank
afdposts$cleantext <- str_replace_all(afdposts$message, pattern = url_pattern, replacement = " ")
# tolower() is vectorised; wrapping it in sapply() only attached the full text
# as element names (which gsub() then carried along) and returned a list for
# empty input.
afdposts$cleantext <- tolower(afdposts$cleantext)
# Quotes and "+" are removed explicitly before the generic [[:punct:]] pass.
afdposts$cleantext <- gsub("\"", " ", afdposts$cleantext)
afdposts$cleantext <- gsub("\'", " ", afdposts$cleantext)
afdposts$cleantext <- gsub("\\+", " ", afdposts$cleantext)
afdposts$cleantext <- gsub("[[:punct:]]", " ", afdposts$cleantext)
afdposts$cleantext <- gsub("[[:space:]]+", " ", afdposts$cleantext)

# remove "afd" and "alternative fuer deutschland"
# (fixed = TRUE: plain substring removal, so "afd" is also cut out of longer
# words that contain it)
afdposts$cleantext.stopped <- gsub("afd", "", afdposts$cleantext, fixed = TRUE)
afdposts$cleantext.stopped <- gsub("alternative für deutschland", "", afdposts$cleantext.stopped, fixed = TRUE)
# remove stopwords
# NOTE: the stopword list is built, but the call that would apply it is
# commented out, so stopwords currently stay in the text.
mystopwords <- c(stopwords("german"), "dass", "für")
#afdposts$cleantext.stopped <- removeFeatures(afdposts$cleantext.stopped, stopwords = mystopwords)

# Hopefully getting rid of remaining tricky characters
afdposts$cleantext.stopped.graph <- str_replace_all(afdposts$cleantext.stopped, "[^[:graph:]]", " ")

# Look for Issues
#
# One 0/1 indicator column per issue: 1 if the cleaned post text matches the
# issue's regular expression, 0 otherwise (posts with missing text get 0).
# The patterns are word stems matched anywhere in the text, so e.g. "euro"
# also hits "europa" and "ihad" hits "jihad"/"dschihad".
# Names are the indicator columns that get added to `afdposts`.
issue_patterns <- c(
  haseuro      = "euro",
  hasgr        = "athen|griech",
  hasislam     = "moslem|muslim|islam|kopftuch|scharia|ihad",
  hasmigration = "(zu|ein)wander|migrant|asyl|fl[üue]+cht|ausländer",
  haspegida    = "gida",
  hascrime     = "krimin|verbrech|straft",
  hasgender    = "schwul|homo|gender|schwuch|lesb",
  hasrussiaukr = "ruß|russ|ukrain",
  hasturkey    = "türk|erdog",
  hasbulruman  = "bulgar|rumän",
  # "sozialsyst" was misspelt "sozialssyt" and therefore never matched
  # "Sozialsystem(e)".
  haswelfare   = "wohlfahrts|sozialsyst|sozialleist|sozialtour"
)

for (flag in names(issue_patterns)) {
  # grepl() yields FALSE for NA text; as.numeric() keeps the columns as 0/1
  # doubles so that mean() over a day gives the share of posts on the issue.
  afdposts[[flag]] <- as.numeric(
    grepl(issue_patterns[[flag]], afdposts$cleantext.stopped.graph,
          ignore.case = TRUE, perl = TRUE)
  )
}

# Aggregate by Day
#
# Result: long tibble `issues` with one row per day and issue and the columns
#   Datum    day as "YYYY-MM-DD" string
#   Thema    issue label (German)
#   value    per cent of that day's AfD posts flagged for the issue
#   plotdate `Datum` parsed to a Date for plotting

afdposts$ddate <- strftime(afdposts$created_time, format = "%Y-%m-%d")

# Daily share of posts per issue; the column names double as plot labels.
issues <- afdposts %>%
  group_by(Datum = ddate) %>%
  summarise(
    Euro = mean(haseuro),
    Griechenland = mean(hasgr),
    Islam = mean(hasislam),
    Migration = mean(hasmigration),
    Pegida = mean(haspegida),
    Crime = mean(hascrime),
    Gender = mean(hasgender),
    Russland = mean(hasrussiaukr),
    `Türkei` = mean(hasturkey),
    `Bulgarien/Rumänien` = mean(hasbulruman),
    Sozialstaat = mean(haswelfare)
  )

# wide -> long: one row per (day, issue)
issues <- as_tibble(melt(issues, id.vars = "Datum"))
# melt() returns the issue label as a factor
issues$variable <- as.character(issues$variable)

# Multiply * 100 to get per cent
issues$value <- 100 * issues$value

# Generate Plot-Date
issues$plotdate <- ymd(issues$Datum)

names(issues) <- c("Datum", "Thema", "value", "plotdate")

## issues %>% filter(grepl("Euro|Griechenland|Islam|Migration", Thema)) %>% ggplot(aes(x=plotdate,y=value,color=Thema)) + theme_tufte() + scale_x_date() + xlab("") + ylab("Anteil Thema / AfD-Posts") + geom_smooth(se=FALSE) 

## ggsave("themen.png",width=6,height=4)

# Do that in English, too
# Of the four issues plotted below only "Griechenland" differs in English.
issues$Issue <- issues$Thema
issues$Issue[issues$Issue == "Griechenland"] <- "Greece"

# Actually, we should filter out dates after the 2017 election for article.
# The cutoff uses the Date column `plotdate` and the `enddate` constant from
# the top of the script ("2017-09-24", i.e. everything before 25-09-2017)
# instead of comparing the character column `Datum` to a hard-coded date.
#
# Smoothed daily share per issue, black lines told apart by line type. Line
# types are assigned to the issues in alphabetical order.
issues %>%
  filter(plotdate <= as.Date(enddate)) %>%
  filter(grepl("Euro|Greece|Islam|Migration", Issue)) %>%
  ggplot(aes(x = plotdate, y = value, linetype = Issue)) +
  theme_tufte() +
  scale_x_date() +
  xlab("") +
  ylab("% Issue / AfD-Posts") +
  geom_smooth(se = FALSE, color = "black") +
  theme(legend.key.width = unit(4, "line")) +
  scale_linetype_manual(values = c("solid", "dotted", "dashed", "dotdash"))

ggsave("issues.pdf", width = 6, height = 4)

# is the smoothing hiding something?

# Simply aggregate by year: unweighted mean of the daily percentages per
# calendar year and issue.
# NOTE(review): unlike the smoothed plot, this one is not restricted to dates
# before the 2017 election -- confirm that this is intended.
issues$year <- year(issues$plotdate)

issues %>%
  filter(grepl("Euro|Greece|Islam|Migration", Issue)) %>%
  group_by(year, Issue) %>%
  # the argument is na.rm; the former `rm.na` was silently swallowed by `...`
  summarise(imean = mean(value, na.rm = TRUE)) %>%
  ggplot(aes(x = year, y = imean, color = Issue)) +
  # geom_line() has no `se` parameter (that belongs to geom_smooth())
  geom_line() +
  geom_point() +
  theme_tufte() +
  xlab("") +
  ylab("% Issue / AfD-Posts") +
  scale_color_colorblind()

ggsave("yearly-issues.pdf", width = 6, height = 4)

